The (interactive) correlation heatmap reveals very high correlation among TG compounds.
load("./data.Rdata")
X <- df[df$Diagnosis == "Probable AD", 1:230]
C <- round(cor(X), 2)
heatmaply_cor(C, color = viridis, plot_method = "plotly", dendrogram = F, reorderfun = sort.default(d,w), main = "Correlation Heatmap", file = "heatmap.html", colorbar_thickness = 15, colorbar_len = 0.5)
Performing a Global Test on the serum metabolites of AD patients, correcting for sex (Ho: E4 status has no effect on metabolite levels, Ha: it has an effect), yields a significant difference (p = 0.046) between E4 carriers and non-carriers. Testing if the counts of E4 alleles have an effect on metabolite levels showed no significant nuance.
load("data.Rdata")
gdf <- subset(df, Diagnosis == "Probable AD")
gdf$sex <- as.numeric(gdf$sex) -1
# Binary outcome
gt.b <- globaltest::gt(E4 ~ 1, E4 ~ . - APOE - Diagnosis, data = gdf)
gt.b
p-value Statistic Expected Std.dev #Cov
1 0.0467 2.02 0.794 0.611 231
# Multinomial outcome
gdf$APOE <- as.factor(gdf$APOE)
gt.m <- globaltest::gt(APOE ~ 1, APOE ~ . - E4 -Diagnosis, data = gdf)
gt.m
p-value Statistic Expected Std.dev #Cov
1 0.135 1.25 0.8 0.443 231
Triglycerides and diglycerides seem to survive FDR control.
load("ADdata.Rdata")
# Store all observations of Class 1 in C1
C1 <- ADmets[geno$APOEb == "E4NO", 1:230]
# Store all observations of Class 2 in C2
C2 <- ADmets[geno$APOEb == "E4YES", 1:230]
# Create a function to perform the Wilcoxon rank-sum (Mann-Whitney U) test on two vectors
MannWhitneyU <- function(x, y) {
wilcoxon <- wilcox.test(x, y, paired = FALSE, alternative = "less")
return(wilcoxon$p.value)
}
# Use purrr::map2 to apply the function to corresponding columns
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
dplyr::filter(results, p_adj < 0.05)
p_values p_adj
Lip.TG.52.2. 6.168152e-04 0.0236445812
Lip.TG.52.3. 1.357772e-04 0.0156143833
Lip.TG.52.4. 4.513399e-04 0.0236445812
Lip.TG.54.3. 1.723621e-03 0.0312827654
Lip.TG.54.4. 1.187039e-03 0.0277759895
Lip.TG.54.5. 4.598021e-04 0.0236445812
Lip.TG.54.6. 7.933971e-04 0.0240590581
Lip.TG.56.2. 1.904168e-03 0.0312827654
Lip.TG.56.3. 1.904168e-03 0.0312827654
Lip.TG.56.6. 5.528916e-04 0.0236445812
Lip.TG.56.7. 8.368368e-04 0.0240590581
Lip.TG.56.8. 1.207652e-03 0.0277759895
Lip.TG.58.9. 1.432553e-03 0.0299533905
Lip.DG.36.3. 3.523873e-06 0.0008104908
Carriers of one copy of E4 vs two copies tend to have less histamine, fumaric acid, uracil, triglyceride TG.48.0, phosphatidylcholine PC.36.4, with p < 0.05, however these don’t survive FDRcut at 0.95 (p_adj = 0.97)’
geno$g <- geno$APOE
geno$g[geno$g == "E3E4"] <- geno$g[geno$g == "E2E4"] <- "E4x1"
geno$g[geno$g == "E4E4"] <- "E4x2"
# Store all observations of Class 1 in C1
C1 <- ADmets[geno$g == "E4x1", 1:230]
# Store all observations of Class 2 in C2
C2 <- ADmets[geno$g == "E4x2", 1:230]
# Use purrr::map2 to apply the function to corresponding columns
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
dplyr::filter(results, p_adj < 0.05)
[1] p_values p_adj
<0 rows> (or 0-length row.names)
Testing no E4 vs one E4, no metabolites differ significantly.
geno$g <- geno$APOE
geno$g[geno$g == "E3E4"] <- geno$g[geno$g == "E2E4"] <- "E4x1"
geno$g[geno$APOEb == "E4NO"] <- "E4x0"
# Store all observations of Class 1 in C1
C1 <- ADmets[geno$g == "E4x1", 1:230]
# Store all observations of Class 2 in C2
C2 <- ADmets[geno$g == "E4x0", 1:230]
# Use purrr::map2 to apply the function to corresponding columns
wilcoxons <- purrr::map2(C1[, 1:230], C2[, 1:230], ~ MannWhitneyU(.x, .y))
# Coerce p_values to dataframe and transpose it
p_values <- t(data.frame(wilcoxons))
# Calculate the FDR-adjusted p-values
p_adj <- p.adjust(wilcoxons, method = "fdr")
results <- data.frame(p_values, p_adj)
# Filter out the non-significant (a=0.05) FDR-adjusted p-values
dplyr::filter(results, p_adj < 0.05)
[1] p_values p_adj
<0 rows> (or 0-length row.names)
The models fitted for the binary and the multinomial classification of metabolites didn’t seem to be able to discriminate well between the presence or absence of the E4 allele, nor the number of copies in AD. ### Function fit
fit <- function(title,
X,
y,
model,
ctrl = NULL,
grid = NULL,
seed = 123, ...) {
set.seed(seed)
# Merge X and y into df
df <- merge.data.frame(X, y)
# Train the model
mdl <- caret::train(df[, 1:ncol(X)], df$y,
method = model,
tuneGrid = grid,
trControl = ctrl,
metric = "ROC",
preProcess = c("center", "scale"),
...
)
# Create a confusion matrix and get performance metrics from caret
obs <- mdl$pred$obs
preds <- mdl$pred$pred
cm <- confusionMatrix(obs, preds,
dnn = c("X0", "X1"), # nolint
positive = "X1")
# Predictions
ys <- as.numeric(obs) -1
yhats <- mdl$pred$X1
roc <- roc(ys, yhats,
levels = c(0, 1),
ci = TRUE, boot.n = 1000, ci.alpha = 0.95)
metrics <- data.frame(c(cm$byClass, roc$auc),
row.names = c(names(cm$byClass), "AUC")
)
names(metrics) <- title
out <- list("metrics" = metrics, "roc" = roc, "model" = mdl)
return(out)
}
load("./data.Rdata")
X <- df[df$Diagnosis == "Probable AD", 1:230]
y <- df[df$Diagnosis == "Probable AD", "E4"]
y <- as.factor(y)
levels(y) <- make.names(levels(y))
# Define the model training parameters, repeated 10-fold cross-valuidation
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 50,
classProbs = TRUE,
summaryFunction = twoClassSummary,
savePredictions = TRUE,
sampling = 'smote',
)
lr <- fit(
title = "Logistic Regression",
X = X,
y = y,
model = "glmnet",
ctrl = ctrl,
grid = expand.grid(alpha = c(0), lambda = c(100)))
Loading required package: recipes
Attaching package: 'recipes'
The following object is masked from 'package:Matrix':
update
The following object is masked from 'package:stats':
step
Setting direction: controls < cases
tree <- fit(
title = "Decision Tree",
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth = c(2))
)
Setting direction: controls < cases
tree$metrics
Decision Tree
Sensitivity 0.64646876
Specificity 0.30851052
Pos Pred Value 0.13520319
Neg Pred Value 0.83918504
Precision 0.13520319
Recall 0.64646876
F1 0.22363508
Prevalence 0.14326989
Detection Rate 0.09261951
Detection Prevalence 0.68503937
Balanced Accuracy 0.47748964
AUC 0.47755338
param <- data.frame(nrounds = c(10), max_depth = c(2), eta = c(0.3), gamma = c(0), colsample_bytree = c(0.5), min_child_weight = c(1), subsample = c(1))
rf <- fit(title = "XGBoost",
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = param
)
Setting direction: controls > cases
rf$metrics
XGBoost
Sensitivity 0.6034167
Specificity 0.2373725
Pos Pred Value 0.4292642
Neg Pred Value 0.3863780
Precision 0.4292642
Recall 0.6034167
F1 0.5016558
Prevalence 0.4873297
Detection Rate 0.2940629
Detection Prevalence 0.6850394
Balanced Accuracy 0.4203946
AUC 0.6297622
#Get a performance metrics table
metrics <- cbind(lr$metrics, tree$metrics, rf$metrics)
metrics
Logistic Regression Decision Tree XGBoost
Sensitivity 0.6850589 0.64646876 0.6034167
Specificity 0.3151365 0.30851052 0.2373725
Pos Pred Value 0.9000815 0.13520319 0.4292642
Neg Pred Value 0.1000000 0.83918504 0.3863780
Precision 0.9000815 0.13520319 0.4292642
Recall 0.6850589 0.64646876 0.6034167
F1 0.7779864 0.22363508 0.5016558
Prevalence 0.9000558 0.14326989 0.4873297
Detection Rate 0.6165912 0.09261951 0.2940629
Detection Prevalence 0.6850394 0.68503937 0.6850394
Balanced Accuracy 0.5000977 0.47748964 0.4203946
AUC 0.5000407 0.47755338 0.6297622
#Plot ROC curves
rocs <- list(lr$roc, tree$roc, rf$roc)
# Generate labels
labels <- paste0(names(metrics), ", AUC = ", paste(round(metrics[12, ], 2)))
# httpgd::hgd()
ggroc(rocs) +
theme_clean() +
scale_color_tableau(labels = labels)
multifit <- function(
X,
y,
model,
ctrl = NULL,
grid = NULL,
seed = 123, ...) {
set.seed(seed)
# Merge X and y into df
df <- merge.data.frame(X, y)
# Train the model
mdl <- caret::train(df[, 1:ncol(X)], df$y,
method = model,
tuneGrid = grid,
trControl = ctrl,
metric = "logLoss",
...
)
# Create a confusion matrix and get performance metrics from caret
obs <- mdl$pred$obs
preds <- mdl$pred$pred
cm <- confusionMatrix(obs, preds)
# Predictions
ys <- as.numeric(obs) -1
yhats <- as.numeric(preds) -1
roc <- multiclass.roc(ys, yhats)
out <- list("cm" = cm, "roc" = roc, "model" = mdl)
return(out)
}
X <- df[df$Diagnosis == "Probable AD", 1:230]
y <- df[df$Diagnosis == "Probable AD", "APOE"]
y <- as.factor(y)
levels(y) <- make.names(levels(y))
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 50,
classProbs = TRUE,
summaryFunction = multiClassSummary,
savePredictions = TRUE,
sampling = "up"
)
mlr <- multifit(
X = X,
y = y,
model = "multinom",
ctrl = ctrl,
trace = FALSE,
tuneLength = 1
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mlr$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 66858 83103 104039
X1 142750 87747 156853
X2 63114 56273 45713
Overall Statistics
Accuracy : 0.2484
95% CI : (0.2475, 0.2493)
No Information Rate : 0.3802
P-Value [Acc > NIR] : 1
Kappa : -0.1047
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.2452 0.3863 0.14909
Specificity 0.6494 0.4828 0.76115
Pos Pred Value 0.2632 0.2265 0.27688
Neg Pred Value 0.6274 0.6674 0.59321
Prevalence 0.3382 0.2816 0.38019
Detection Rate 0.0829 0.1088 0.05668
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.4473 0.4346 0.45512
mtree <- multifit(
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth=c(2))
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mtree$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 190866 3130 60004
X1 294793 4503 88054
X2 128394 2283 34423
Overall Statistics
Accuracy : 0.2849
95% CI : (0.284, 0.2859)
No Information Rate : 0.7614
P-Value [Acc > NIR] : 1
Kappa : -0.01
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.3108 0.454115 0.18864
Specificity 0.6719 0.519359 0.79057
Pos Pred Value 0.7514 0.011625 0.20850
Neg Pred Value 0.2340 0.987084 0.76915
Prevalence 0.7614 0.012296 0.22628
Detection Rate 0.2367 0.005584 0.04268
Detection Prevalence 0.3150 0.480315 0.20472
Balanced Accuracy 0.4913 0.486737 0.48961
mrf <- multifit(
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = param
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mrf$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 67982 83995 102023
X1 140453 92186 154711
X2 61848 56383 46869
Overall Statistics
Accuracy : 0.2567
95% CI : (0.2558, 0.2577)
No Information Rate : 0.3765
P-Value [Acc > NIR] : 1
Kappa : -0.0949
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.2515 0.3964 0.15438
Specificity 0.6531 0.4857 0.76488
Pos Pred Value 0.2676 0.2380 0.28388
Neg Pred Value 0.6338 0.6650 0.59970
Prevalence 0.3352 0.2884 0.37647
Detection Rate 0.0843 0.1143 0.05812
Detection Prevalence 0.3150 0.4803 0.20472
Balanced Accuracy 0.4523 0.4410 0.45963
#Get a performance metrics table
multimetrics <- cbind(mlr$cm$byClass[3,], mtree$cm$byClass[3,], mrf$cm$byClass[3,])
multimetrics
[,1] [,2] [,3]
Sensitivity 0.14909411 0.18863882 0.15437594
Specificity 0.76115196 0.79057133 0.76487679
Pos Pred Value 0.27688068 0.20849788 0.28388250
Neg Pred Value 0.59321431 0.76914633 0.59969751
Precision 0.27688068 0.20849788 0.28388250
Recall 0.14909411 0.18863882 0.15437594
F1 0.19382029 0.19807182 0.19999445
Prevalence 0.38019096 0.22627689 0.37646847
Detection Rate 0.05668423 0.04268461 0.05811768
Detection Prevalence 0.20472441 0.20472441 0.20472441
Balanced Accuracy 0.45512303 0.48960507 0.45962637
#Plot ROC curves
mrocs <- list(mlr$roc, mtree$roc, mrf$roc)
# Generate labels
labels <- paste0(names(metrics), ", AUC = ", paste(round(metrics[12, ], 2)))
load("thomson.Rdata")
X <- thomson
y <- df[df$Diagnosis == "Probable AD", "E4"]
y <- as.factor(y)
levels(y) <- make.names(levels(y))
ctrl$summaryFunction <- twoClassSummary
lrf <- fit(title = "Logistic Regression",
X = X,
y = y,
model = "glm",
ctrl = ctrl,
)
Setting direction: controls > cases
treef <- fit(
title = "Decision Tree",
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = expand.grid(maxdepth = c(2))
)
Setting direction: controls < cases
param <- data.frame(nrounds = c(10), max_depth = c(2), eta = c(0.3), gamma = c(0), colsample_bytree = 1, min_child_weight = c(1), subsample = c(1))
rff <- fit(title = "XGBoost",
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = param
)
Setting direction: controls > cases
#Get a performance metrics table
metricsf <- cbind(lrf$metrics, treef$metrics, rff$metrics)
metricsf
Logistic Regression Decision Tree XGBoost
Sensitivity 0.6731920 0.6668807 0.6437766
Specificity 0.3029211 0.3103424 0.2725787
Pos Pred Value 0.4953046 0.1973844 0.4761698
Neg Pred Value 0.4770197 0.7855512 0.4269291
Precision 0.4953046 0.1973844 0.4761698
Recall 0.6731920 0.6668807 0.6437766
F1 0.5707078 0.3046099 0.5474315
Prevalence 0.5040213 0.2027590 0.5066898
Detection Rate 0.3393031 0.1352161 0.3261951
Detection Prevalence 0.6850394 0.6850394 0.6850394
Balanced Accuracy 0.4880566 0.4886116 0.4581777
AUC 0.5204983 0.4854656 0.5701351
#Plot ROC curves
rocsf <- list(lrf$roc, treef$roc, rff$roc)
# Generate labels
labelsf <- paste0(names(metricsf), ", AUC = ", paste(round(metricsf[12, ], 2)))
# httpgd::hgd()
ggroc(rocsf) +
theme_clean() +
scale_color_tableau(labels = labels)
y <- df[df$Diagnosis == "Probable AD", "APOE"]
y <- as.factor(y)
levels(y) <- make.names(levels(y))
ctrl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 10,
classProbs = TRUE,
summaryFunction = multiClassSummary,
savePredictions = TRUE,
sampling = "up"
)
mlrf <- multifit(
X = X,
y = y,
model = "multinom",
ctrl = ctrl,
trace = FALSE,
tuneLength = 1
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mlrf$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 16591 14804 19405
X1 27083 20639 29748
X2 11715 9483 11822
Overall Statistics
Accuracy : 0.3041
95% CI : (0.3019, 0.3064)
No Information Rate : 0.378
P-Value [Acc > NIR] : 1
Kappa : -0.0224
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.2995 0.4594 0.1939
Specificity 0.6770 0.5116 0.7887
Pos Pred Value 0.3266 0.2664 0.3580
Neg Pred Value 0.6489 0.7102 0.6168
Prevalence 0.3434 0.2785 0.3780
Detection Rate 0.1029 0.1280 0.0733
Detection Prevalence 0.3150 0.4803 0.2047
Balanced Accuracy 0.4883 0.4855 0.4913
mtreef <- multifit(
X = X,
y = y,
model = "rpart2",
ctrl = ctrl,
grid = mtree$model$bestTune
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mtreef$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 47322 535 2943
X1 72308 746 4416
X2 30949 381 1690
Overall Statistics
Accuracy : 0.3085
95% CI : (0.3062, 0.3108)
No Information Rate : 0.9336
P-Value [Acc > NIR] : 1
Kappa : -0.0029
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.31427 0.448857 0.18676
Specificity 0.67529 0.519358 0.79421
Pos Pred Value 0.93154 0.009630 0.05118
Neg Pred Value 0.06546 0.989072 0.94263
Prevalence 0.93359 0.010304 0.05610
Detection Rate 0.29340 0.004625 0.01048
Detection Prevalence 0.31496 0.480315 0.20472
Balanced Accuracy 0.49478 0.484107 0.49048
mrff <- multifit(
X = X,
y = y,
model = "xgbTree",
ctrl = ctrl,
grid = param
)
Setting direction: controls < cases
Setting direction: controls < cases
Setting direction: controls < cases
mrff$cm
Confusion Matrix and Statistics
Reference
Prediction X0 X1 X2
X0 14192 16860 19748
X1 27594 20049 29827
X2 12116 11091 9813
Overall Statistics
Accuracy : 0.2731
95% CI : (0.271, 0.2753)
No Information Rate : 0.3682
P-Value [Acc > NIR] : 1
Kappa : -0.0746
Mcnemar's Test P-Value : <2e-16
Statistics by Class:
Class: X0 Class: X1 Class: X2
Sensitivity 0.26329 0.4177 0.16524
Specificity 0.65911 0.4932 0.77226
Pos Pred Value 0.27937 0.2588 0.29718
Neg Pred Value 0.64060 0.6665 0.61351
Prevalence 0.33419 0.2976 0.36821
Detection Rate 0.08799 0.1243 0.06084
Detection Prevalence 0.31496 0.4803 0.20472
Balanced Accuracy 0.46120 0.4554 0.46875
# Get a performance metrics table
multimetricsf <- cbind(mlrf$cm$byClass[3, ], mtreef$cm$byClass[3, ], mrff$cm$byClass[3, ])
names(multimetricsf) <- names(metrics)
multimetricsf
[,1] [,2] [,3]
Sensitivity 0.19388274 0.18676097 0.16523540
Specificity 0.78868564 0.79420787 0.77226158
Pos Pred Value 0.35802544 0.05118110 0.29718353
Neg Pred Value 0.61680050 0.94262883 0.61351056
Precision 0.35802544 0.05118110 0.29718353
Recall 0.19388274 0.18676097 0.16523540
F1 0.25154529 0.08034420 0.21238421
Prevalence 0.37804576 0.05610391 0.36820634
Detection Rate 0.07329655 0.01047802 0.06084072
Detection Prevalence 0.20472441 0.20472441 0.20472441
Balanced Accuracy 0.49128419 0.49048442 0.46874849
attr(,"names")
[1] "Logistic Regression" "Decision Tree" "XGBoost"
[4] NA NA NA
[7] NA NA NA
[10] NA NA NA
[13] NA NA NA
[16] NA NA NA
[19] NA NA NA
[22] NA NA NA
[25] NA NA NA
[28] NA NA NA
[31] NA NA NA